library(epitools)
library(aod)
library(calibrate)
# Read the server survey; the first row of data.csv holds the column names.
srvs <- read.csv("data.csv", header = TRUE)
# A large scipen penalty makes R print fixed notation rather than scientific.
options(scipen = 999)

###table1 -- odds ratios
# Give servers with no detected generator an explicit 'none' category.
# The level set is taken from factor(generatorType) rather than from
# levels(generatorType): read.csv() returns character columns by default since
# R 4.0.0, and levels() of a character vector is NULL, which would turn every
# real generator name into NA (and then into 'none'). union() also keeps the
# level set free of duplicates if 'none' already occurs in the data.
srvs$generatorType <- factor(
  srvs$generatorType,
  levels = union(levels(factor(srvs$generatorType)), 'none')
)
srvs$generatorType[is.na(srvs$generatorType)] <- 'none'
# 'none' is listed first because we want the odds compared to 'none'.
genList <- c('none', 'blogger', 'drupal', 'homestead', 'joomla', 'typo3', 'wordpress', 'zen cart')
# Each comparison uses one compromised list together with the control list.
phishOrControl <- srvs$inPhishList == TRUE | srvs$inControlList == TRUE
cloakOrControl <- srvs$inCloakList == TRUE | srvs$inControlList == TRUE
# Generators that are not in genList become NA in these factors.
gensFactp <- factor(srvs$generatorType[phishOrControl], levels = genList, ordered = TRUE)
gensFactc <- factor(srvs$generatorType[cloakOrControl], levels = genList, ordered = TRUE)
oddsp1 <- oddsratio(gensFactp, srvs$inPhishList[phishOrControl])
oddsc1 <- oddsratio(gensFactc, srvs$inCloakList[cloakOrControl])

# Server types compared in table 1; 'microsoft' is listed first because we
# want the odds compared to msft iis.
serverList <- c('microsoft', 'apache', 'gse', 'nginx', 'yts') #gse = google, yts = yahoo

# Rows whose server type is in serverList and that belong to the phish (or
# cloak) list or to the control list.
phishServerRows <- srvs$serverType %in% serverList &
  (srvs$inPhishList == TRUE | srvs$inControlList == TRUE)
cloakServerRows <- srvs$serverType %in% serverList &
  (srvs$inCloakList == TRUE | srvs$inControlList == TRUE)

# Phishing list vs. control.
cPinSL <- factor(srvs$inPhishList[phishServerRows])
serverFactp <- factor(srvs$serverType[phishServerRows], levels = serverList, ordered = TRUE)
oddsp2 <- oddsratio(serverFactp, cPinSL)

# Cloak list vs. control.
cCinSL <- factor(srvs$inCloakList[cloakServerRows])
serverFactc <- factor(srvs$serverType[cloakServerRows], levels = serverList, ordered = TRUE)
oddsc2 <- oddsratio(serverFactc, cCinSL)


# Add one logical indicator column per server type and per country, plus a
# flag for rows with no recorded server version. The helpers live inside
# local() so that only srvs itself changes.
srvs <- local({
  tagged <- srvs

  # new column name -> serverType value it flags
  serverFlags <- c(apache = "apache", nginx = "nginx", google = "gse",
                   yts = "yts", other = "other")
  for (flag in names(serverFlags)) {
    tagged[[flag]] <- tagged$serverType == serverFlags[[flag]]
  }

  # new column name -> country value it flags
  countryFlags <- c(us = 'United States', de = 'Germany', ca = "Canada",
                    uk = "United Kingdom", fr = 'France', au = "Australia",
                    bv = "Virgin Islands, British", jp = "Japan",
                    cn = "China", bh = 'Bahamas')
  for (flag in names(countryFlags)) {
    tagged[[flag]] <- tagged$country == countryFlags[[flag]]
  }

  # TRUE when the server version is missing.
  tagged$genericSvr <- is.na(tagged$serverVersion)
  tagged
})
# Join the survey with neims.csv (merge() uses the columns the two tables
# share) and keep only the rows that have an MS value.
marExp <- read.csv("neims.csv", header = TRUE)
srvsPrime <- merge(srvs, marExp)
srvsPrime <- srvsPrime[!is.na(srvsPrime$MS), ]
# log2 of MS after scaling by the constants below.
# NOTE(review): the meaning of (1 - .678), 106200000 and 0.85 is not visible
# in this file; MS is divided by 100, so it is presumably a percentage - confirm.
srvsPrime$lgMS <- log(srvsPrime$MS * (1/100) * (1 - .678) * 106200000 * 0.85, base = 2)
# log(0) is -Inf; store 0 for those rows instead.
srvsPrime$lgMS[srvsPrime$lgMS == -Inf] <- 0


###table2 -- market share regressions
# Logistic regression of inPhishList on log2 market share and the server,
# hosting and country indicators, fitted on srvsPrime.
cppReg <- glm(
  inPhishList ~ lgMS + apache + nginx + google + yts + other + HTTPONLY +
    genericSvr + sharedHost + us + de + ca + uk + fr + au + bv + jp + cn + bh,
  data = srvsPrime,
  family = binomial(link = "logit")
)
summary(cppReg) #coef and coef's p-val
# Exponentiated coefficients and confidence bounds, i.e. odds ratios.
exp(cbind(OR = coef(cppReg), confint(cppReg)))
cppReg$null.deviance - cppReg$deviance #chi-sq
pchisq(cppReg$null.deviance - cppReg$deviance,
       cppReg$df.null - cppReg$df.residual,
       lower.tail = FALSE) #p-val

# Rows of srvs whose domain does not appear in srvsPrime.
noCMS <- srvs[!(srvs$domain %in% srvsPrime$domain), ]
# Same phishing model as above but without lgMS, fitted on noCMS.
nocmsregP <- glm(
  inPhishList ~ apache + nginx + google + yts + other + HTTPONLY +
    genericSvr + sharedHost + us + de + ca + uk + fr + au + bv + jp + cn + bh,
  data = noCMS,
  family = binomial(link = "logit")
)
summary(nocmsregP) #coef and coef's p-val
# Odds ratios with confidence bounds. #this will take awhile
exp(cbind(OR = coef(nocmsregP), confint(nocmsregP)))
nocmsregP$null.deviance - nocmsregP$deviance #chi-sq
pchisq(nocmsregP$null.deviance - nocmsregP$deviance,
       nocmsregP$df.null - nocmsregP$df.residual,
       lower.tail = FALSE) #p-val

# Logistic regression of inCloakList on log2 market share and the server,
# hosting and country indicators, fitted on srvsPrime.
cpcReg <- glm(
  inCloakList ~ lgMS + apache + nginx + google + yts + other + HTTPONLY +
    genericSvr + sharedHost + us + de + ca + uk + fr + au + bv + jp + cn + bh,
  data = srvsPrime,
  family = binomial(link = "logit")
)
summary(cpcReg) #coef and coef's p-val
# Exponentiated coefficients and confidence bounds, i.e. odds ratios.
exp(cbind(OR = coef(cpcReg), confint(cpcReg)))
cpcReg$null.deviance - cpcReg$deviance #chi-sq
pchisq(cpcReg$null.deviance - cpcReg$deviance,
       cpcReg$df.null - cpcReg$df.residual,
       lower.tail = FALSE) #p-val

# Cloak-list model without lgMS, fitted on noCMS.
nocmsregC <- glm(
  inCloakList ~ apache + nginx + google + yts + other + HTTPONLY +
    genericSvr + sharedHost + us + de + ca + uk + fr + au + bv + jp + cn + bh,
  data = noCMS,
  family = binomial(link = "logit")
)
summary(nocmsregC) #coef and coef's p-val
# Odds ratios with confidence bounds. #this will take awhile
exp(cbind(OR = coef(nocmsregC), confint(nocmsregC)))
nocmsregC$null.deviance - nocmsregC$deviance #chi-sq
pchisq(nocmsregC$null.deviance - nocmsregC$deviance,
       nocmsregC$df.null - nocmsregC$df.residual,
       lower.tail = FALSE) #p-val


###Fig2 -- WordPress
# Keep the rows whose generator is WordPress; which() leaves out rows where
# the comparison is NA.
wp <- srvsPrime[which(srvsPrime$generatorType == 'wordpress'), ]

# Version '3.6' is treated as an unknown version from here on.
wp$generatorVersion[which(wp$generatorVersion == '3.6')] <- NA
# Re-factor so that only values present in the WordPress subset remain as levels.
wp$generatorVersion <- factor(wp$generatorVersion)
wp$server <- factor(wp$server)
wp$serverType <- factor(wp$serverType)
wp$serverVersion <- factor(wp$serverVersion)
# upToDate is NA when the version is unknown.
wp$upToDate <- wp$generatorVersion == '3.5.1'
# genWp flags rows with no known version.
wp$genWp <- is.na(wp$generatorVersion)
# First character of the version string, with 'n' standing for "no version".
wp$bigWpVsn <- factor(substr(wp$generatorVersion, 1, 1))
levels(wp$bigWpVsn) <- c(levels(wp$bigWpVsn), 'n')
wp$bigWpVsn[is.na(wp$bigWpVsn)] <- 'n'

##Fig2c
# Row masks: one compromised list together with the control list.
wpPhishRows <- wp$inPhishList | wp$inControlList
wpCloakRows <- wp$inCloakList | wp$inControlList

# Unknown version (genWp) vs. compromise.
oddsw1 <- oddsratio(wp$genWp[wpPhishRows], wp$inPhishList[wpPhishRows])
oddsw1c <- oddsratio(wp$genWp[wpCloakRows], wp$inCloakList[wpCloakRows])

# Up to date vs. compromise, only for rows with a known version.
oddsw2 <- oddsratio(wp$upToDate[!wp$genWp & wpPhishRows],
                    wp$inPhishList[!wp$genWp & wpPhishRows])
oddsw2c <- oddsratio(wp$upToDate[!wp$genWp & wpCloakRows],
                     wp$inCloakList[!wp$genWp & wpCloakRows])

# Major version vs. compromise, leaving out major version '1'; 'n' (no
# version) is listed first.
vsnList <- c('n', '2', '3')
vsnFact <- factor(wp$bigWpVsn[wp$bigWpVsn != '1' & wpPhishRows],
                  levels = vsnList, ordered = TRUE)
vsnFactc <- factor(wp$bigWpVsn[wp$bigWpVsn != '1' & wpCloakRows],
                   levels = vsnList, ordered = TRUE)
oddsw3 <- oddsratio(vsnFact, wp$inPhishList[wp$bigWpVsn != '1' & wpPhishRows])
oddsw3c <- oddsratio(vsnFactc, wp$inCloakList[wp$bigWpVsn != '1' & wpCloakRows])

# Full WordPress version strings, in the order used for the tables below.
wpvlist <- c(
  '2.0', '2.0.2', '2.0.3', '2.0.4', '2.0.5', '2.0.6', '2.0.10', '2.0.11',
  '2.1', '2.1.3', '2.2', '2.2.1', '2.2.2', '2.2.3',
  '2.3', '2.3.1', '2.3.2', '2.3.3', '2.5', '2.5.1',
  '2.6', '2.6.1', '2.6.2', '2.6.3', '2.6.5', '2.7', '2.7.1',
  '2.8', '2.8.1', '2.8.2', '2.8.3', '2.8.4', '2.8.5', '2.8.6',
  '2.9', '2.9.1', '2.9.2',
  '3.0', '3.0.1', '3.0.2', '3.0.3', '3.0.4', '3.0.5', '3.0.6',
  '3.1', '3.1.1', '3.1.2', '3.1.3', '3.1.4', '3.2', '3.2.1',
  '3.3', '3.3.1', '3.3.2', '3.4', '3.4.1', '3.4.2', '3.5', '3.5.1'
)

# Counts per full version for rows with major version 2 or 3.
wpvtable <- table(factor(wp$generatorVersion[wp$bigWpVsn %in% c('2', '3')],
                         levels = wpvlist, ordered = TRUE))
# Same rows cross-tabulated with inPhishList; only the second ("Phish")
# column is kept.
wpPhishTable <- table(factor(wp$generatorVersion[wp$bigWpVsn %in% c('2', '3')],
                             levels = wpvlist, ordered = TRUE),
                      wp$inPhishList[wp$bigWpVsn %in% c('2', '3')])
colnames(wpPhishTable) <- c("NotaPhish", "Phish")
wpPhishTable <- wpPhishTable[, "Phish"]

wpvvector <- as.vector(wpvtable)
wppvector <- as.vector(wpPhishTable)

# Minor version = first three characters of the version string; rows with
# major version '1' get NA before the factor is built.
wp$minorVersion <- substr(wp$generatorVersion, 1, 3)
wp$minorVersion[wp$bigWpVsn == '1'] <- NA
wp$minorVersion <- factor(wp$minorVersion)
wpvtable2 <- table(wp$minorVersion)

# Per-minor-version counts of phishing rows.
wpPhishTable2 <- table(factor(wp$minorVersion[wp$bigWpVsn %in% c('2', '3')]),
                       wp$inPhishList[wp$bigWpVsn %in% c('2', '3')])
colnames(wpPhishTable2) <- c("NotaPhish", "Phish")
wpPhishTable2 <- wpPhishTable2[, "Phish"]

# Per-minor-version counts of cloak-list rows.
wpCloakTable <- table(factor(wp$minorVersion[wp$bigWpVsn %in% c('2', '3')]),
                      wp$inCloakList[wp$bigWpVsn %in% c('2', '3')])
colnames(wpCloakTable) <- c("NotaCloak", "Cloak")
wpCloakTable <- wpCloakTable[, "Cloak"]

# A row counts as compromised when it is on either list. The whitespace-only
# labels (" ." = not compromised, "  " = compromised) are used as the column
# names of wpt2.
wp$isComp <- wp$inPhishList | wp$inCloakList
wp$prettyNames <- factor(wp$isComp, levels = c(FALSE, TRUE),
                         labels = c(" .", "  "), ordered = TRUE)
wpt2 <- table(wp$minorVersion, wp$prettyNames)

#Fig2b
# Shaded mosaic of minor version against the compromised flag; the default
# title and y label are blanked and the y-axis caption is written by mtext().
mosaicplot(wpt2, shade = TRUE, las = 3, main = "", ylab = "")
mtext("Compromised         Not Compromised", side = 2, cex = 1)

#fig2a
# Dataset frequency per WordPress minor version (left axis) overlaid with
# phishing and search-redirection counts (right axis).
par(mar = c(5, 5, 4, 5) + 0.1)
# The first label is temporarily changed to "2.0." while plotting and restored
# right after.
# NOTE(review): presumably this makes the labels non-numeric so that plot()
# places the points at 1, 2, 3, ... instead of at the numeric label values -
# confirm.
names(wpvtable2)[1] <- "2.0."
plot(wpvtable2, col = 'blue', type = 'p', xlab = "WordPress Subversion",
     ylab = 'Webserver Dataset Frequency', main = '', las = 3, xaxt = 'n',
     cex.lab = 1.4)
names(wpvtable2)[1] <- "2.0"
# Draw the compromise counts over the same region with their own y scale.
par(new = TRUE)
plot(wpPhishTable2, col = 'darkgreen', type = 'p', pch = 4, xaxt = 'n',
     yaxt = 'n', xlab = '', ylab = '', ylim = c(0, 850))
# points() draws into the coordinate system set up by the plot() call above,
# so the axis, label and ylim arguments are not repeated here.
points(wpCloakTable, col = 'red', pch = 8)
# One tick per minor version; seq_along() keeps the tick positions and the
# labels the same length however many versions are present (was fixed at 1:15).
axis(side = 1, at = seq_along(wpvtable2), labels = names(wpvtable2), las = 3)
axis(side = 4)
mtext("Compromise Frequency", side = 4, line = 3, cex = 1.4)
legend('topleft', c('Webserver Dataset', 'Phish', 'Search Redirection'),
       col = c('blue', 'darkgreen', 'red'), pch = c(1, 4, 8))


#Section 3.3 regression
# Per-version figures, read from wordpressVersion.csv.
wpv <- read.table('wordpressVersion.csv', header = TRUE, sep = ',', quote = '"')
# Relabel minor versions 2 and 3 as "2.0" and "3.0", then drop the now-unused
# levels.
# NOTE(review): presumably this is so the values match wp$minorVersion in the
# merge below - confirm.
wpv$minorVersion <- factor(wpv$minorVersion)
levels(wpv$minorVersion) <- c(levels(wpv$minorVersion), "2.0", "3.0")
wpv$minorVersion[wpv$minorVersion == 2] <- "2.0"
wpv$minorVersion[wpv$minorVersion == 3] <- "3.0"
wpv$minorVersion <- factor(wpv$minorVersion)
# Left join: every wp row is kept, matched on the columns the two tables share.
wp <- merge(wp, wpv, all.x = TRUE)

# Per-version market share and its log2.
# NOTE(review): the meaning of (1 - .678), 106200000 and 0.85 is not visible
# in this file - confirm.
wp$vsnMS <- (wp$percentMaj / 100) * (wp$percentAllCMS / 100) * (1 - .678) * 106200000 * 0.85
wp$lgvsnMS <- log(wp$vsnMS, base = 2)

# Logistic regressions on the phishing-plus-control rows. The data expressions
# are kept exactly as written so the Call printed by summary() is unchanged.
wpreg <- glm(
  inPhishList ~ vsnMS,
  data = wp[wp$inPhishList==T | wp$inControlList==T,],
  family = binomial(link = "logit")
)
wpreg2 <- glm(
  inPhishList ~ lgvsnMS,
  data = wp[wp$inPhishList==T | wp$inControlList==T,],
  family = binomial(link = "logit")
)
summary(wpreg2)
# Odds ratios with confidence bounds.
exp(cbind(OR = coef(wpreg2), confint(wpreg2)))
# Chi-square statistic (null deviance minus residual deviance) and its p-value.
wpreg2$null.deviance - wpreg2$deviance
pchisq(wpreg2$null.deviance - wpreg2$deviance,
       wpreg2$df.null - wpreg2$df.residual,
       lower.tail = FALSE)

#Appendix 2 regression
# Works on the marExp object that is already in the workspace; the re-read is
# left disabled:
#marExp <- read.csv('neims.csv',header=T)
# log2 of the exploit count, with log(0) = -Inf stored as 0.
marExp$lgexp <- log(marExp$exploits, base = 2)
marExp$lgexp[marExp$lgexp == -Inf] <- 0
# log2 of the scaled market share.
# NOTE(review): the meaning of (1 - .678), 106200000 and 0.85 is not visible
# in this file - confirm.
marExp$lgMS <- log(marExp$MS * (1/100) * (1 - .678) * 106200000 * 0.85, base = 2)
# Rows with MS of zero or less are filtered out here, instead of setting
# -Inf to 0 as in the disabled line below.
marExp <- marExp[marExp$MS > 0, ]
#marExp$lgMS[marExp$lgMS==-Inf] <- 0
# Linear regression of log2 exploits on log2 market share.
marReg <- lm(lgexp ~ lgMS, data = marExp)
summary(marReg)
confint(marReg)
